import pandas as pd
df = pd.read_csv(r"C:\Users\lakshita rawat\Downloads\Project_2\Dataset_1.csv")
df.columns

Index(['Retailer', 'Months', 'Year', 'Invoice Date', 'Region', 'State', 'City',
       'Product', 'Price per Unit($)', 'Units Sold', 'Total Sales($)',
       'Operating Profit($)', 'Operating Margin', 'Sales Method'],
      dtype='object')

import matplotlib.pyplot as plt

# Count number of invoices per retailer (proxy for transactions)

transactions = df.groupby('Retailer')['Invoice Date'].nunique().sort_values(ascending=False)
ax = transactions.plot(kind='bar', figsize=(10,6), color="skyblue")
for i, v in enumerate(transactions):
    ax.text(i, v, f'{int(v)}', ha='center', va='bottom')

# Visualization
transactions.plot(kind='bar', figsize=(10,5), color='skyblue')
plt.title("Number of Transactions per Retailer")
plt.ylabel("Transactions")
plt.show()



# Highest revenue by the retailer

revenue = df.groupby('Retailer')['Total Sales($)'].sum().sort_values(ascending=False)
ax = revenue.plot(kind='bar', figsize=(10,6), color="skyblue")
for i, v in enumerate(revenue):
    ax.text(i, v, f'{int(v)}', ha='center', va='bottom')
# Visualization
revenue.plot(kind='bar', figsize=(10,5), color='lightgreen')
plt.title("Total Revenue per Retailer")
plt.ylabel("Revenue ($)")
plt.show()



# Operating Margin across the Retailers

margin = df.groupby('Retailer')['Operating Margin'].mean().sort_values(ascending=False)
print(margin)
# Visualization
margin.plot(kind='bar', figsize=(10,5), color='orange')
plt.title("Average Operating Margin per Retailer")
plt.ylabel("Operating Margin (%)")
plt.show()



# Units Sold VS Profit Generated

import seaborn as sns
units_vs_profit = df.groupby('Retailer').agg({
    'Units Sold':'sum',
    'Operating Profit($)':'sum'
}).sort_values(by='Operating Profit($)', ascending=False)
print(units_vs_profit)

# Visualization: Units vs Profit
plt.figure(figsize=(8,6))
sns.scatterplot(
    x='Units Sold', 
    y='Operating Profit($)', 
    data=units_vs_profit.reset_index(), 
    hue='Retailer', s=120
)
plt.title("Units Sold vs Operating Profit per Retailer")
plt.xlabel("Units Sold")
plt.ylabel("Operating Profit ($)")
plt.show()



## Sales contribution per retailer by region

region_retailer = df.groupby(['Region','Retailer'])['Total Sales($)'].sum().reset_index()
print(region_retailer)
plt.figure(figsize=(12,6))
sns.barplot(x='Region', y='Total Sales($)', hue='Retailer', data=region_retailer)
plt.title("Retailer Sales Contribution by Region")
plt.ylabel("Total Sales ($)")
plt.xlabel("Region")
plt.legend(title="Retailer")
plt.show()


# Sales split by product category per retailer

product_mix = df.groupby(['Retailer','Product'])['Total Sales($)'].sum().reset_index()
print(product_mix)
plt.figure(figsize=(12,6))
sns.barplot(x='Retailer', y='Total Sales($)', hue='Product', data=product_mix)
plt.title("Product Mix by Retailer")
plt.ylabel("Total Sales ($)")
plt.xlabel("Retailer")
plt.legend(title="Product")
plt.show()



# Retailer-wise sales method comparison

sales_method = df.groupby(['Retailer','Sales Method'])['Total Sales($)'].sum().reset_index()
print(sales_method)
plt.figure(figsize=(12,6))
sns.barplot(x='Retailer', y='Total Sales($)', hue='Sales Method', data=sales_method)
plt.title("Sales Method Contribution by Retailer")
plt.ylabel("Total Sales ($)")
plt.xlabel("Retailer")
plt.legend(title="Sales Method")
plt.show()

Retailer
Sports Direct    0.444892
Kohl's           0.419291
Foot Locker      0.417933
Amazon           0.417871
West Gear        0.417856
Walmart          0.406534
Name: Operating Margin, dtype: float64

               Units Sold  Operating Profit($)
Retailer                                      
West Gear          625262             85667968
Foot Locker        604369             80722234
Sports Direct      557640             74333022
Kohl's             287375             36811302
Amazon             197990             28818533
Walmart            206225             25782063

       Region       Retailer  Total Sales($)
0     Midwest         Amazon        16835873
1     Midwest    Foot Locker        47987394
2     Midwest         Kohl's        22229415
3     Midwest  Sports Direct        26207191
4     Midwest      West Gear        22540586
5   Northeast         Amazon        36262590
6   Northeast    Foot Locker        65326474
7   Northeast         Kohl's        14031168
8   Northeast  Sports Direct        24698097
9   Northeast        Walmart        13712005
10  Northeast      West Gear        32293733
11      South         Amazon          409091
12      South    Foot Locker         9307025
13      South         Kohl's         3552055
14      South  Sports Direct        65258121
15      South        Walmart        33049858
16      South      West Gear        33087031
17  Southeast         Amazon        10826333
18  Southeast    Foot Locker        59669118
19  Southeast  Sports Direct        54178543
20  Southeast        Walmart        21005539
21  Southeast      West Gear        17491703
22       West         Amazon        13365025
23       West    Foot Locker        37804709
24       West         Kohl's        62302115
25       West  Sports Direct        12129045
26       West        Walmart         6791008
27       West      West Gear       137551280

         Retailer                    Product  Total Sales($)
0          Amazon              Men's Apparel        10474770
1          Amazon    Men's Athletic Footwear        12011959
2          Amazon      Men's Street Footwear        22161652
3          Amazon            Women's Apparel        15710639
4          Amazon  Women's Athletic Footwear         7935255
5          Amazon    Women's Street Footwear         9404637
6     Foot Locker              Men's Apparel        29508995
7     Foot Locker    Men's Athletic Footwear        36480415
8     Foot Locker      Men's Street Footwear        57481575
9     Foot Locker            Women's Apparel        43296114
10    Foot Locker  Women's Athletic Footwear        24239624
11    Foot Locker    Women's Street Footwear        29087997
12         Kohl's              Men's Apparel        14164965
13         Kohl's    Men's Athletic Footwear        17885316
14         Kohl's      Men's Street Footwear        22421073
15         Kohl's            Women's Apparel        20315239
16         Kohl's  Women's Athletic Footwear        12493869
17         Kohl's    Women's Street Footwear        14834291
18  Sports Direct              Men's Apparel        24825286
19  Sports Direct    Men's Athletic Footwear        30889734
20  Sports Direct      Men's Street Footwear        38672270
21  Sports Direct            Women's Apparel        37113257
22  Sports Direct  Women's Athletic Footwear        23145288
23  Sports Direct    Women's Street Footwear        27825162
24        Walmart              Men's Apparel        11490680
25        Walmart    Men's Athletic Footwear        13600404
26        Walmart      Men's Street Footwear        14924255
27        Walmart            Women's Apparel        15316099
28        Walmart  Women's Athletic Footwear         8762792
29        Walmart    Women's Street Footwear        10464180
30      West Gear              Men's Apparel        33263936
31      West Gear    Men's Athletic Footwear        42805852
32      West Gear      Men's Street Footwear        53165419
33      West Gear            Women's Apparel        47287512
34      West Gear  Women's Athletic Footwear        30055068
35      West Gear    Women's Street Footwear        36386546

         Retailer Sales Method  Total Sales($)
0          Amazon     In-store        22366250
1          Amazon       Online        28909731
2          Amazon       Outlet        26422931
3     Foot Locker     In-store        76525000
4     Foot Locker       Online        72943290
5     Foot Locker       Outlet        70626430
6          Kohl's     In-store        29566250
7          Kohl's       Online        30992229
8          Kohl's       Outlet        41556274
9   Sports Direct     In-store        55048500
10  Sports Direct       Online        59225997
11  Sports Direct       Outlet        68196500
12        Walmart     In-store        16411250
13        Walmart       Online        15069494
14        Walmart       Outlet        43077666
15      West Gear     In-store       156726500
16      West Gear       Online        40532141
17      West Gear       Outlet        45705692

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 1. Revenue by Product
revenue_by_product = df.groupby('Product')['Total Sales($)'].sum().sort_values(ascending=False)

plt.figure(figsize=(10,5))
ax = revenue_by_product.plot(kind='bar', color='skyblue')
plt.title("Revenue Contribution by Product")
plt.ylabel("Total Sales ($)")
plt.xticks(rotation=45, ha='right')

# Annotation
for i, v in enumerate(revenue_by_product):
    ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.show()



# 2. Profit by Product
profit_by_product = df.groupby('Product')['Operating Profit($)'].sum().sort_values(ascending=False)

plt.figure(figsize=(10,5))
ax = profit_by_product.plot(kind='bar', color='lightgreen')
plt.title("Profit Contribution by Product")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')
# Annotation
for i, v in enumerate(profit_by_product):
    ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()



# 3. Units Sold vs State (Heatmap)
top_states = df.groupby('State')['Units Sold'].sum().nlargest(10).index
filtered_df = df[df['State'].isin(top_states)]

heatmap_data = filtered_df.pivot_table(
    values='Units Sold',
    index='State',
    columns='Product',
    aggfunc='sum',
    fill_value=0
)
plt.figure(figsize=(12,6))
sns.heatmap(heatmap_data, cmap="YlGnBu", annot=True, fmt="d", cbar_kws={'label': 'Units Sold'})
plt.title("Units Sold by Product in Top 10 States")
plt.xlabel("Product")
plt.ylabel("State")
plt.tight_layout()
plt.show()


print()

# 4. Loss-Making Products
df['Invoice Date'] = pd.to_datetime(df['Invoice Date'], errors='coerce')

loss_products = df.groupby('Product')['Operating Profit($)'].sum()
loss_products = loss_products[loss_products < 0].sort_values()

if loss_products.empty:
    print("No loss-making products found.")
else:
    plt.figure(figsize=(10,6))
    ax = loss_products.plot(kind='bar', color='red')
    plt.title("Loss-Making Products by Operating Profit")
    plt.ylabel("Operating Profit ($)")
    plt.xlabel("Product")
    plt.xticks(rotation=45, ha='right')

    # Annotation
    for i, v in enumerate(loss_products):
        ax.text(i, v, f'{int(v):,}', ha='center', va='top' if v < 0 else 'bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

print()
print()


# 5. Highest Operating Margin
margin_by_product = df.groupby('Product')['Operating Margin'].mean().sort_values(ascending=False)

# Print values before plotting
print("Average Operating Margin by Product:")
print(margin_by_product)

plt.figure(figsize=(10,5))
margin_by_product.plot(kind='bar', color='orange')
plt.title("Average Operating Margin by Product")
plt.ylabel("Operating Margin (%)")
plt.xticks(rotation=45, ha='right')
plt.show()



# 6. Average Order Value (AOV) by Product
aov_by_product = df.groupby('Product')['Total Sales($)'].sum() / df.groupby('Product')['Invoice Date'].nunique()
aov_by_product = aov_by_product.sort_values(ascending=False)

# Print values before plotting
print("\nAverage Order Value (AOV) by Product:")
print(aov_by_product)

plt.figure(figsize=(10,5))
aov_by_product.plot(kind='bar', color='purple')
plt.title("Average Order Value (AOV) by Product")
plt.ylabel("AOV ($)")
plt.xticks(rotation=45, ha='right')
plt.show()



# 7. Seasonality in Product Sales
monthly_product_sales = df.groupby(['Months','Product'])['Total Sales($)'].sum().reset_index()

plt.figure(figsize=(12,6))
sns.lineplot(x='Months', y='Total Sales($)', hue='Product', data=monthly_product_sales, marker='o')
plt.title("Monthly Sales Trend by Product")
plt.ylabel("Total Sales ($)")
plt.xlabel("Month")
plt.show()



# 8. Sales method and the Product
# Group by Product and Sales Method → sum of Units Sold
product_sales_method = df.groupby(['Product', 'Sales Method'])['Units Sold'].sum().unstack(fill_value=0)

# Plot stacked bar chart
ax = product_sales_method.plot(
    kind='bar',
    stacked=True,
    figsize=(12,6),
    colormap="tab20"
)
plt.title("Sales Method Distribution by Product")
plt.xlabel("Product")
plt.ylabel("Units Sold")
plt.xticks(rotation=45, ha='right')
plt.legend(title="Sales Method")
plt.tight_layout()

# --- Add annotations on each stacked bar section ---
for container in ax.containers:
    ax.bar_label(container, label_type="center", fontsize=8, color="black")
plt.show()



# 9. Top Products by Region
region_product_sales = df.groupby(['Region','Product'])['Total Sales($)'].sum().reset_index()
plt.figure(figsize=(12,6))
sns.barplot(x='Region', y='Total Sales($)', hue='Product', data=region_product_sales)
plt.title("Top Products by Region")
plt.ylabel("Total Sales ($)")
plt.xlabel("Region")
plt.show()

No loss-making products found.


Average Operating Margin by Product:
Product
Men's Street Footwear        0.446130
Women's Apparel              0.441318
Women's Athletic Footwear    0.424359
Men's Apparel                0.413225
Women's Street Footwear      0.410199
Men's Athletic Footwear      0.402702
Name: Operating Margin, dtype: float64

Average Order Value (AOV) by Product:
Product
Men's Street Footwear        1.373857e+06
Women's Apparel              1.147685e+06
Men's Athletic Footwear      9.850877e+05
Women's Street Footwear      8.258246e+05
Men's Apparel                7.880805e+05
Women's Athletic Footwear    6.924149e+05
dtype: float64

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# 1. Which retailer has the highest profit contribution?
profit_by_retailer = df.groupby('Retailer')['Operating Profit($)'].sum().sort_values(ascending=False)

plt.figure(figsize=(10,5))
ax = profit_by_retailer.plot(kind='bar', color="lightgreen")
plt.title("Profit Contribution by Retailer")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')

# Annotate values
for i, v in enumerate(profit_by_retailer):
    ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False)   # removes grid from left axis
plt.show()



# 2. Which regions are profitable vs unprofitable?
region_profit = df.groupby('Region')['Operating Profit($)'].sum().sort_values(ascending=False)

plt.figure(figsize=(8,5))
ax = region_profit.plot(kind='bar', color="skyblue")
plt.title("Profitability by Region")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')

# Annotate values
for i, v in enumerate(region_profit):
    ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False)   # removes grid from left axis

plt.show()



# 3. Hidden gems: Products with Low Sales but High Margins
product_summary = df.groupby('Product').agg({
    'Total Sales($)': 'sum',
    'Operating Margin': 'mean'
}).reset_index()

# Use median as threshold
sales_median = product_summary['Total Sales($)'].median()
margin_median = product_summary['Operating Margin'].median()

hidden_gems = product_summary[
    (product_summary['Total Sales($)'] < sales_median) &
    (product_summary['Operating Margin'] > margin_median)
]

print("Hidden Gem Products (Low Sales, High Margin):")
print(hidden_gems)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Product Summary 
product_summary = df.groupby('Product').agg({
    'Total Sales($)': 'sum',
    'Operating Margin': 'mean'
}).reset_index()
product_summary['Margin_Display'] = product_summary['Operating Margin'].round(2)

x = np.arange(len(product_summary['Product']))
width = 0.4

fig, ax1 = plt.subplots(figsize=(12,6))

# --- Left axis: Total Sales ---
bars1 = ax1.bar(x - width/2, product_summary['Total Sales($)'], width, 
                label='Total Sales ($)', color='skyblue')
ax1.set_ylabel("Total Sales ($)", color="blue")
ax1.tick_params(axis="y", labelcolor="blue")

# --- Right axis: Operating Margin ---
ax2 = ax1.twinx()
bars2 = ax2.bar(x + width/2, product_summary['Operating Margin'], width, 
                label='Operating Margin (%)', color='orange')
ax2.set_ylabel("Operating Margin (%)", color="orange")
ax2.tick_params(axis="y", labelcolor="orange")

# --- Annotations for Sales ---
for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
             f'{int(height):,}', ha='center', va='bottom', fontsize=9)

# --- Annotations for Margin ---
for bar, val in zip(bars2, product_summary['Margin_Display']):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
             f'{val:.2f}%', ha='center', va='bottom', fontsize=9)

# --- Formatting ---
ax1.set_title("Product-wise Sales vs Operating Margin", fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(product_summary['Product'], rotation=45, ha='right')
ax1.grid(False)   # removes grid from left axis
ax2.grid(False)   # removes grid from right axis
fig.tight_layout()
plt.show()
print()



# 4. Risky Volume Play: Retailers with High Sales but Low Margins
retailer_summary = df.groupby('Retailer').agg({
    'Total Sales($)': 'sum',
    'Operating Margin': 'mean'
}).reset_index()

sales_median = retailer_summary['Total Sales($)'].median()
margin_median = retailer_summary['Operating Margin'].median()

risky_retailers = retailer_summary[
    (retailer_summary['Total Sales($)'] > sales_median) &
    (retailer_summary['Operating Margin'] < margin_median)
]

print("Risky Volume Play Retailers (High Sales, Low Margin):")
print(risky_retailers)
# --- 1. Retailer Summary ---
retailer_summary = df.groupby('Retailer').agg({
    'Total Sales($)': 'sum',
    'Operating Margin': 'mean'
}).reset_index()
retailer_summary['Margin_Display'] = retailer_summary['Operating Margin'].round(2)

x = np.arange(len(retailer_summary['Retailer']))
width = 0.4

fig, ax1 = plt.subplots(figsize=(12,6))

# --- Left axis: Total Sales ---
bars1 = ax1.bar(x - width/2, retailer_summary['Total Sales($)'], width, 
                label='Total Sales ($)', color='lightgreen')
ax1.set_ylabel("Total Sales ($)", color="green")
ax1.tick_params(axis="y", labelcolor="green")

# --- Right axis: Operating Margin ---
ax2 = ax1.twinx()
bars2 = ax2.bar(x + width/2, retailer_summary['Operating Margin'], width, 
                label='Operating Margin (%)', color='coral')
ax2.set_ylabel("Operating Margin (%)", color="red")
ax2.tick_params(axis="y", labelcolor="red")

# --- Annotations for Sales ---
for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
             f'{int(height):,}', ha='center', va='bottom', fontsize=9)

# --- Annotations for Margin ---
for bar, val in zip(bars2, retailer_summary['Margin_Display']):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2, height + 0.02*height,
             f'{val:.2f}%', ha='center', va='bottom', fontsize=9)

# --- Formatting ---
ax1.set_title("Retailer-wise Sales vs Operating Margin", fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(retailer_summary['Retailer'], rotation=45, ha='right')
ax1.grid(False)
ax2.grid(False)
fig.tight_layout()
plt.show()



# 5. Which Sales Method is making more profit
method_profit = df.groupby('Sales Method')['Operating Profit($)'].sum().sort_values(ascending=False)

plt.figure(figsize=(8,5))
ax = method_profit.plot(kind='bar', color="orange")
plt.title("Profit Contribution by Sales Method")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')

# Annotate values
for i, v in enumerate(method_profit):
    ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False)   # removes grid from left axis

plt.show()



# 6. Which State is making more profit?
state_profit = df.groupby('State')['Operating Profit($)'].sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(12,6))
ax = state_profit.plot(kind='bar', color="teal")
plt.title("Top 10 States by Profit Contribution")
plt.ylabel("Operating Profit ($)")
plt.xticks(rotation=45, ha='right')

# Annotate values
for i, v in enumerate(state_profit):
    ax.text(i, v, f'{int(v):,}', ha='center', va='bottom', fontsize=9)
ax.grid(False)   # removes grid from left axis

plt.show()

Hidden Gem Products (Low Sales, High Margin):
                     Product  Total Sales($)  Operating Margin
4  Women's Athletic Footwear       106631896          0.424359

Risky Volume Play Retailers (High Sales, Low Margin):
    Retailer  Total Sales($)  Operating Margin
5  West Gear       242964333          0.417856

import seaborn as sns
import matplotlib.pyplot as plt

# Select only numeric columns for correlation
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Compute correlation matrix
corr = numeric_df.corr()

# Plot heatmap
plt.figure(figsize=(10,7))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Analysis Heatmap", fontsize=14)
plt.show()

Retailer Analysis¶

Product Analysis¶

Profitabiity Analysis¶

Correlation Analysis¶